QUESTION: Which songs have the lowest and highest overall scores?

To answer this question we will look at all songs across all algorithms.

First, we will sum their scores and sort them to find the lowest and highest scoring songs, then graph the results.


In [9]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.signal import argrelmax, argrelmin
%matplotlib inline

In [10]:
def data_path(year, d_type='results'):
    code = os.getcwd().split("/")
    root = os.path.join("/", *code[:-1])
    if d_type == 'results':
        the_path = os.path.join(root, 'datasets', str(year), 'result_sets')
    elif d_type == 'groundtruth':
        the_path = os.path.join(root, 'datasets', str(year), 'ground_truth')
    elif d_type == 'outputs':
        the_path = os.path.join(root, 'datasets', str(year), 'outputs')
    else:
        raise ValueError('unknown d_type: {0}'.format(d_type))
    assert os.path.isdir(the_path), 'no such path exists: {0}'.format(the_path)
    return the_path


def competition_files(year, d_type='results', ext='csv'):
    root = data_path(year, d_type)
    orig_dir = os.getcwd()
    os.chdir(root)
    file_struct = os.walk(root)
    comp_files = []
    for path, dirs, files in file_struct:
        for file in files:
            if ext == file.split(".")[-1]:
                comp_files.append(os.path.join(path, file))
    os.chdir(orig_dir)
    return comp_files


def competition_results(year, header=0):
    comp_results = {}
    comp_files = competition_files(year)
    songs_used = []
    for filename in comp_files:
        key = filename.split("/")[-1]
        comp_results[key] = pd.read_csv(filename, header=header)

        # normalise the song names in the 2009 result sets
        if year == 2009:
            comp_results[key]['filename'] = comp_results[key]['filename'].apply(
                lambda x: x.split(".")[0].lower().replace("__", "_-_"))
            songs_used = songs_used + comp_results[key].filename.unique().tolist()
        else:
            songs_used = songs_used + comp_results[key].File.unique().tolist()
  
    try:
        songs_used.remove('ave')
        songs_used.remove('weighted ave')
    except ValueError:
        pass
    
    return comp_results, np.array(list(set(songs_used)))


def songs_by_score(songs_and_sums):
    return sorted(songs_and_sums.items(), key=lambda x: x[1])


def summed_overlaps(comp_results, songs_used, col="Overlap_Score", file="filename"):
    # Go through each submitted algorithm's results:
    songs = dict(zip(songs_used, [0]*len(songs_used)))
    for algorithm in comp_results:
        # For this algorithm, check all songs
        for idx, row in comp_results[algorithm].iterrows():
            songs[row[file]] = songs[row[file]] + row[col]
    return songs


def show_top_bottom_ten(overlap_sums, year):
    t = list(overlap_sums.items())
    t.sort(key=lambda x: x[1])

    plt.figure(figsize=(15,5))
    #plot bottom 10:
    plt.plot(range(10), [score for song, score in t[:10]])
    #plot top 10:
    plt.plot(range(10, 20), [score for song, score in t[-10:]])    
    plt.ylabel("Total overlap, year {0}".format(year))
    plt.xlabel("Song")
    
    song_labels = [song for song, score in t[:10]] + [song for song, score in t[-10:]]
    #plot song labels for bottom & top ten:
    plt.xticks(range(20), song_labels, rotation='vertical')
    
    plt.title("Lowest and Highest 10 Songs, year {0}".format(year))
    plt.show()
    return t
    
    
def show_all_scores(overlap_sums, year):
    values = np.array(list(overlap_sums.values()))

    plt.figure(figsize=(20,20))
    plt.plot(range(len(overlap_sums)), values)

    plt.ylabel("Total overlap, year {0}".format(year))
    plt.xlabel("Song")
    plt.title("Total Overlap Score for All Songs, year {0}".format(year))
    plt.show()

In [11]:
results_2009, songs_2009 = competition_results(2009)
overlap_sums_2009 = summed_overlaps(results_2009, songs_2009)
sorted_song_totals_2009 = show_top_bottom_ten(overlap_sums_2009, 2009)


Here we have the top 10 songs by overall score, as well as the bottom 10. Scores are totalled across all submitted algorithms.

The next question to answer is, do these songs have anything in common?

To investigate this we will examine the ground truth files for each of the 10 lowest and highest scorers. We will also look at the top and bottom 10 scorers in other years on the same dataset. The reasoning is that if there is something unique about a song that makes its chords difficult to predict, that difficulty will persist across time. If not, then it can be concluded to be a failing of the 2009 submissions.

Unfortunately, MIREX changed how it runs the competition after 2009. It claims to use the same dataset, but all of the songs have since been relabelled to names like 'chord_mrx_09_000001', making it impossible to determine whether the same songs have the lowest overlap score in subsequent years. The overlap score is also no longer provided.

QUESTION: How do the overlaps compare across algorithms?

To answer this question we will order the songs alphabetically and plot each song's score, by algorithm. Then we will overlay the plots to see how each algorithm's predictive power varies over the set.


In [12]:
def compare_all_algo_overlaps(competition_results, songs, year, col="Overlap_Score"):
    plt.figure(figsize=(60,20))
    for result_set in competition_results.values():
        #iter through each algo 
        #plot this algo's overlap score.
        
        #get the overlap scores for this algo in a numpy array:
        overlaps = result_set[col].values
        end = overlaps.shape[0]
        plt.plot(range(end), overlaps)

    plt.ylabel("Total overlap, year {0}".format(year))
    plt.xlabel("Song")
    plt.title("Total Overlap Score for All Songs, year {0}".format(year))
    plt.xticks(range(end), [song for song in songs], rotation='vertical')
    plt.show()
    
compare_all_algo_overlaps(results_2009, songs_2009, 2009)


What we observe here is that many of the peaks and valleys are shared across submissions. In other words, generally speaking, these differing algorithms all find the same songs easy or difficult.
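
One rough way to quantify this is to line the per-song overlap scores up by filename and look at the correlation between submissions. This is only a sketch: it assumes the 'filename' and 'Overlap_Score' columns loaded above, and drops the summary rows first.


In [ ]:
per_algo = {}
for name, df in results_2009.items():
    scores = df.set_index('filename')['Overlap_Score']
    # keep only real songs; drop the 'ave' / 'weighted ave' summary rows
    per_algo[name] = scores[~scores.index.isin(['ave', 'weighted ave'])]

# rows: songs, columns: submissions; high correlations mean shared peaks and valleys
score_table = pd.DataFrame(per_algo)
print(score_table.corr().round(2))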

Let's find out what songs are a local minimum for all 13 submissions at the same time.


In [13]:
def bad_score_by_most_submissions(competition_results, songs, year, cutoff):
    min_idxes = []
    song_idx = []

    for result_set in competition_results.values():
        # indices that are a local minimum of this submission's overlap scores:
        algos_min_idxes = argrelmin(np.array(result_set.Overlap_Score.tolist()))[0]
        for idx in algos_min_idxes:
            min_idxes.append(idx)

    for idx in min_idxes:
        # keep indices that are a local minimum for at least `cutoff` submissions:
        if min_idxes.count(idx) >= cutoff and idx not in song_idx:
            song_idx.append(idx)

    return songs[song_idx]

In [14]:
common_bad_songs_2009 = bad_score_by_most_submissions(results_2009, 
                                                      songs_2009, 
                                                      2009, 
                                                      13)

for song in common_bad_songs_2009:
    print(song)


03_-_she
04_-_nowhere_man
06_-_i_want_you
08_-_strawberry_fields_forever
cd2_-_13_-_good_night

Let's investigate whether or not these songs have something in common, and whether that is why most of the submitted algorithms did poorly on them.


In [18]:
def ground_truth(year):
    ground_truth = {}
    truth_files = competition_files(year, d_type='groundtruth', ext='lab')
    for filename in truth_files:
        ground_truth[filename.split("/")[-1].split(".")[0].lower().replace("\'", "")] = pd.read_csv(filename, sep=" ",
                                                            names=['onset', 'offset', 'chord'])
    return ground_truth

In [19]:
ground_truth_2009 = ground_truth(2009)

In [21]:
#print(common_bad_songs_2009[0])
#ground_truth_2009['01_']
#ground_truth_2009[common_bad_songs_2009[0]]

#result = pd.concat([df.chord for df in ground_truth_2009.values()], axis=1)

In [22]:
#result

I see a lot of chords in minor keys. I am beginning to wonder if this has something to do with it.

I could look into the average proportion of minor, major, diminished, sharp/flat, and numbered (extended) chords -- then I can check whether these 5 songs differ in some way from the norm.
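
A rough sketch of that check is below. It assumes the ground-truth keys line up with the song names in common_bad_songs_2009 (as the commented-out cells above attempt), and the quality categories are a crude, hypothetical split.


In [ ]:
def chord_quality_counts(truth_df):
    # crude tally of chord qualities in one ground-truth frame
    counts = {'min': 0, 'dim': 0, 'numbered': 0, 'sharp': 0, 'other': 0}
    for chord in truth_df['chord'].astype(str):
        if ':min' in chord:
            counts['min'] += 1
        elif ':dim' in chord:
            counts['dim'] += 1
        elif any(c.isdigit() for c in chord):
            counts['numbered'] += 1
        elif '#' in chord:
            counts['sharp'] += 1
        else:
            counts['other'] += 1
    return counts

for song in common_bad_songs_2009:
    print(song, chord_quality_counts(ground_truth_2009[song]))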

QUESTION: What is the average score of each algorithm?

QUESTION: What is the highest average scoring algorithm?

QUESTION: By how much is it better?

QUESTION: How much better is the average score when you choose the best algorithm each time?

QUESTION: Is there any way to predict which guess is best out of all?


In [23]:
#What is the average score of each algorithm? (overlap score)
def algorithms_ave(results):
    averages = []
    
    for algorithm in results.keys():
        averages.append((algorithm, results[algorithm].loc[results[algorithm]['filename'] == 'ave'].Overlap_Score.tolist()[0]))

    return sorted(averages, key=lambda x: x[1], reverse=True)

In [24]:
averages_2009 = algorithms_ave(results_2009)

In [25]:
# Which algorithm has the highest average?
averages_2009[0]


Out[25]:
('OGF2.task1.overlap.results.csv', 0.72199999999999998)

In [26]:
# By how much is it better? Compare with the second-best average:
averages_2009[1]


Out[26]:
('MD.task1.overlap.results.csv', 0.71700000000000008)

In [27]:
# What's the average when you choose the best algorithm for each song?
def best_algo(results, songs):
    test = []
    for song in songs:
        test.append([])

        for algo in results.keys():
            try:
                test[-1].append(results[algo].loc[results[algo]['filename'] == song].Overlap_Score.tolist()[0])
            except IndexError:
                pass
    return [max(rank) for rank in test]

highest_rankings = best_algo(results_2009, songs_2009)

# the average you get if you always choose the best algorithm for each song.
print(np.average(highest_rankings))


0.798358490566

Is there any way to predict which algorithm is best when you run all of them? For 2009 I only have the overlap values, so I cannot look into this there. I'll try 2011.

MIREX 2011; using different stats

For the remainder of this project only the major/minor vocabulary will be considered (MIREX decided to use multiple vocabularies -- essentially multiple ways to round chord approximations -- for the results from this year forward).

They further decided to add songs to the dataset, and to relabel all of the songs, making cross-comparison with 2009 impossible.


In [28]:
results_2011, songs_2011 = competition_results(2011, header=1)

In [29]:
# what columns do we have?
print(*results_2011['CB3.csv'].columns.values, sep=", ", end="")


File, Pairwise score (%), Duration (s), All correct (%), Root correct (%), Type correct (%), All wrong (%), Unique ref chords, Unique test chords

In [30]:
# What is the average pairwise score for each algorithm?
def pairwise_averages(submissions, year):
    averages = []
    for submission in submissions.keys():
        averages.append((submission, np.average(submissions[submission]['Pairwise score (%)'])))
    return sorted(averages, key=lambda x: x[1], reverse=True)

In [31]:
# Why is SB8.csv so bad?
results_2011['SB8.csv']


Out[31]:
File Pairwise score (%) Duration (s) All correct (%) Root correct (%) Type correct (%) All wrong (%) Unique ref chords Unique test chords
0 chord_mrx_09_000000 7.195279 164.310204 7.106907 0.000000 70.430939 20.385091 5 2
1 chord_mrx_09_000001 52.759570 200.129245 52.681304 0.000000 44.123252 0.000000 1 2
2 chord_mrx_09_000002 1.576387 182.143129 1.483939 0.000000 66.615397 28.709145 5 2
3 chord_mrx_09_000003 11.044484 153.469388 10.932808 0.000000 83.055223 0.000000 4 2
4 chord_mrx_09_000004 9.607959 236.088889 9.529460 0.000000 71.812197 16.271073 13 2
5 chord_mrx_09_000005 0.074933 194.841000 0.000000 0.000000 58.720187 32.270929 12 2
6 chord_mrx_09_000006 19.019114 183.771429 18.938346 0.000000 78.085743 0.000000 4 2
7 chord_mrx_09_000007 11.487391 112.875102 11.341121 0.000000 63.319890 20.252870 8 2
8 chord_mrx_09_000008 14.138102 172.329805 14.033722 0.000000 66.378449 16.431711 10 2
9 chord_mrx_09_000009 9.023314 164.022857 8.911832 0.000000 73.425121 17.006623 8 2
10 chord_mrx_09_000010 0.651220 149.394286 0.528534 10.392921 50.206887 35.023514 8 2
11 chord_mrx_09_000011 23.000611 211.487347 22.912971 0.000000 57.449006 16.567853 7 2
12 chord_mrx_09_000012 6.820186 147.513469 6.712081 0.000000 89.834989 -0.000000 3 2
13 chord_mrx_09_000013 11.122332 277.080816 11.064653 0.000000 84.629410 1.521008 8 2
14 chord_mrx_09_000014 0.168635 263.332325 0.111231 0.000000 92.366890 2.213252 5 2
15 chord_mrx_09_000015 4.418220 111.962523 4.257233 1.262347 79.902447 14.087414 10 2
16 chord_mrx_09_000016 5.151430 180.192653 5.071153 2.497201 66.556843 23.156568 9 2
17 chord_mrx_09_000017 17.153528 132.465487 17.017705 0.000000 75.969221 6.266638 8 2
18 chord_mrx_09_000018 0.172080 101.459592 0.000000 0.000000 87.597804 0.000000 3 2
19 chord_mrx_09_000019 0.101974 145.075374 0.000000 0.000000 68.888455 26.959451 5 2
20 chord_mrx_09_000020 0.116436 129.509298 0.000000 19.166614 52.260649 23.608806 6 2
21 chord_mrx_09_000021 1.015114 200.943000 0.926531 0.000000 95.667528 0.000000 5 2
22 chord_mrx_09_000022 0.118135 157.152653 0.000000 0.000000 94.330756 2.179373 4 2
23 chord_mrx_09_000023 5.847838 110.184490 5.694921 0.000000 69.673868 20.604316 4 2
24 chord_mrx_09_000024 6.623289 154.955465 6.527699 0.000000 77.641372 12.609684 8 2
25 chord_mrx_09_000025 1.204043 183.133000 1.116675 0.000000 96.370944 0.000000 6 2
26 chord_mrx_09_000026 0.085191 174.158367 0.000000 0.000000 97.679149 -0.000000 3 2
27 chord_mrx_09_000027 17.849418 160.198986 17.746345 0.000000 47.403349 32.990151 5 2
28 chord_mrx_09_000028 1.386207 347.205000 1.343581 3.851759 67.529851 23.732809 17 2
29 chord_mrx_09_000029 30.076955 110.064000 29.947031 0.000000 46.840111 19.090711 8 2
... ... ... ... ... ... ... ... ... ...
187 chord_mrx_09_000187 5.308567 139.337143 5.195788 0.000000 72.330069 20.064158 8 2
188 chord_mrx_09_000188 17.908689 158.255809 17.798366 6.421083 68.604769 3.768913 8 2
189 chord_mrx_09_000189 0.065453 234.814694 0.000000 6.731220 73.204383 11.594838 6 2
190 chord_mrx_09_000190 68.220510 462.400701 0.000000 0.000000 0.000000 0.000000 0 0
191 chord_mrx_09_000191 0.085963 182.044406 0.000000 0.000000 83.281003 2.136480 4 2
192 chord_mrx_09_000192 7.366594 50.024490 7.070759 0.000000 85.277589 0.000000 3 2
193 chord_mrx_09_000193 2.061102 156.210973 1.966110 0.000000 96.697311 -0.000000 5 2
194 chord_mrx_09_000194 13.523701 181.681633 13.442441 0.000000 32.650698 50.744816 9 2
195 chord_mrx_09_000195 0.139322 127.190204 0.000000 0.000000 77.546440 18.951706 5 2
196 chord_mrx_09_000196 2.970178 117.556605 2.834560 0.000000 65.518071 27.629999 9 2
197 chord_mrx_09_000197 5.381410 144.153140 5.268435 0.000000 88.512362 0.000000 5 2
198 chord_mrx_09_000198 10.017200 78.284271 10.017200 0.000000 74.582889 0.000000 7 2
199 chord_mrx_09_000199 0.318158 52.924082 0.000000 0.000000 98.978973 0.000000 5 2
200 chord_mrx_09_000200 2.901507 125.440000 2.785914 0.000000 78.597618 16.048868 6 2
201 chord_mrx_09_000201 5.386459 172.021000 5.283564 0.000000 56.465292 25.349231 15 2
202 chord_mrx_09_000202 0.116218 134.739592 0.000000 0.000000 98.410101 0.000000 3 2
203 chord_mrx_09_000203 15.510297 125.844446 15.395156 0.000000 74.995384 5.978226 6 2
204 chord_mrx_09_000204 6.195669 125.498415 6.047509 2.386701 67.315629 20.459735 11 2
205 chord_mrx_09_000205 6.696657 128.568888 6.561227 4.082713 46.147822 38.588248 7 2
206 chord_mrx_09_000206 0.068025 220.507000 0.000000 0.000000 94.854132 1.811734 5 2
207 chord_mrx_09_000207 2.371608 153.887347 2.260263 8.762136 32.171564 52.894464 8 2
208 chord_mrx_09_000208 9.433501 110.352834 9.296760 0.000000 65.741520 21.115203 6 2
209 chord_mrx_09_000209 3.569932 227.033107 3.503332 0.000000 80.643016 14.093583 7 2
210 chord_mrx_09_000210 5.052894 262.507000 4.988896 0.000000 91.319012 2.812877 6 2
211 chord_mrx_09_000211 3.879746 150.881000 3.783644 6.029487 59.080467 29.236716 10 2
212 chord_mrx_09_000212 0.119096 143.856327 0.000000 4.602971 86.251888 5.937150 6 2
213 chord_mrx_09_000213 28.989399 134.948571 28.856333 0.000000 47.058283 20.200457 6 2
214 chord_mrx_09_000214 13.338068 121.832199 13.338068 0.000000 76.393606 -0.000000 6 2
215 chord_mrx_09_000215 7.185898 252.187000 7.117298 4.342611 53.925500 26.330263 10 2
216 chord_mrx_09_000216 2.602549 127.582041 2.484162 0.000000 90.687030 2.830102 5 2

217 rows × 9 columns


In [32]:
results_2015, songs_2015 = competition_results(2015, header=1)

In [33]:
print(*results_2015['CM3.csv'].columns.values, sep=", ", end="")


File, Pairwise score (%), Duration (s), Correct chords (%), Substituted chords (%), Deleted chords (%), Inserted chords (%), Correct no-chords (%), maj correct (%), maj proportion (%), min correct (%), min proportion (%), 0 chromas wrong (%), 1 chroma wrong (%), 2 chromas wrong (%), 3 chromas wrong (%), Both correct (%), Only root correct (%), Only type correct (%), Both wrong (%), Unique ref chords, Unique test chords

In [34]:
print("2011")
pairwise_ave_2011 = pairwise_averages(results_2011, 2011)

print("Average pairwise score, by submission:\n")
for team, ave in pairwise_ave_2011:
    print("\t", team, ": ", ave)

print("\nAverage pairwise score, overall:\n")
print(np.average([ave for team, ave in pairwise_ave_2011]))


print("\n\n2015")
pairwise_ave_2015 = pairwise_averages(results_2015, 2015)
print("Average pairwise score, by submission:\n")
for team, ave in pairwise_ave_2015:
    print("\t", team, ": ", ave)

print("\nAverage pairwise score, overall\n")
print(np.average([ave for team, ave in pairwise_ave_2015]))


2011
Average pairwise score, by submission:

	 KO1.csv :  83.0884868341
	 CB4.csv :  82.9067395484
	 CB3.csv :  82.3049815161
	 NMSD2.csv :  81.9755760276
	 NMSD1.csv :  81.3272498479
	 KO2.csv :  80.4467090645
	 NG1.csv :  76.3820656912
	 CF2.csv :  76.1563450737
	 PP3.csv :  76.131154576
	 PP4.csv :  73.7860588664
	 NG2.csv :  71.7758610415
	 SB8.csv :  7.26376459447

Average pairwise score, overall:

72.7954160568


2015
Average pairwise score, by submission:

	 KO1.csv :  83.0884868341
	 DK9.csv :  77.8503528525
	 DK8.csv :  76.9473918802
	 DK7.csv :  76.9473918802
	 DK6.csv :  76.5584058111
	 DK5.csv :  74.1535448525
	 DK4.csv :  68.5614606083
	 CM3.csv :  55.3696745484

Average pairwise score, overall

73.6845886584

Here we see that the highest average pairwise score didn't increase at all from 2011 to 2015. We further see that the overall average of all submissions increased by a mere 0.9 percentage points. Finally, we see that this is largely because the worst 2011 submission (SB8.csv) was not resubmitted for the 2015 contest. In other words, the algorithms didn't get better; instead, the worst were simply excluded.
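
These figures can be checked directly from the lists computed above; a quick sketch:


In [ ]:
# overall change from 2011 to 2015, in percentage points:
overall_2011 = np.average([ave for team, ave in pairwise_ave_2011])
overall_2015 = np.average([ave for team, ave in pairwise_ave_2015])
print(overall_2015 - overall_2011)

# for comparison, the 2011 overall average with SB8.csv excluded:
print(np.average([ave for team, ave in pairwise_ave_2011 if team != 'SB8.csv']))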


In [35]:
compare_all_algo_overlaps(results_2015, songs_2015, 2015, col='Pairwise score (%)')



In [36]:
overlap_sums_2011 = summed_overlaps(results_2011, songs_2011, col='Pairwise score (%)', file="File")
sorted_song_totals_2011 = show_top_bottom_ten(overlap_sums_2011, 2011)

overlap_sums_2015 = summed_overlaps(results_2015, songs_2015, col='Pairwise score (%)', file="File")
sorted_song_totals_2015 = show_top_bottom_ten(overlap_sums_2015, 2015)



In [37]:
def common_songs(yearx, yeary):
    common_songs = []
    songsx = [song for song, score in yearx]
    songsy = [song for song, score in yeary]
    for song in songsx:
        if song in songsy:
            common_songs.append(song)
    return common_songs

In [38]:
print(common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10]))


['chord_mrx_09_000017', 'chord_mrx_09_000199', 'chord_mrx_09_000050', 'chord_mrx_09_000151', 'chord_mrx_09_000190', 'chord_mrx_09_000031', 'chord_mrx_09_000053', 'chord_mrx_09_000030']

Here we see that 8 of the bottom 10 songs (when their total pairwise scores are tallied) persist from 2011 to 2015. This suggests either that these songs are inherently difficult, or that the submissions are overfit to the dataset.

QUESTION: What are the mistakes made on these songs?

Here I will compare the ground truth file with the output of each submission, and look for common mistakes.
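
A sketch of how that comparison could run is below. It reuses the data_path and common_songs helpers from above; the per-algorithm output directories and the '.wav.txt' extension follow the layout used further down in this notebook, so treat them as assumptions.


In [ ]:
from collections import Counter

def read_lab(path):
    # parse 'onset offset chord' lines into (float, float, str) triples
    segments = []
    with open(path) as stream:
        for line in stream:
            parts = line.split()
            if len(parts) >= 3:
                segments.append((float(parts[0]), float(parts[1]), parts[2]))
    return segments

def chord_at(segments, t):
    # return the chord label covering time t, or 'N' if none does
    for onset, offset, chord in segments:
        if onset <= t < offset:
            return chord
    return 'N'

def common_mistakes(truth_dir, output_dir, songs, algos, step=1.0):
    # tally (ground truth, predicted) chord pairs, sampled every `step` seconds
    mistakes = Counter()
    for song in songs:
        truth = read_lab(os.path.join(truth_dir, song + '.lab'))
        for algo in algos:
            pred = read_lab(os.path.join(output_dir, algo, song + '.wav.txt'))
            t = 0.0
            while t < truth[-1][1]:
                expected, got = chord_at(truth, t), chord_at(pred, t)
                if expected != got:
                    mistakes[(expected, got)] += 1
                t += step
    return mistakes

# e.g., the most common confusions on the songs that stayed in the bottom 10:
# bad = common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10])
# common_mistakes(data_path(2015, 'groundtruth'), data_path(2015, 'outputs'), bad,
#                 ['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1']).most_common(10)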

QUESTION: What is the accuracy of predicting which algorithm will be best for a given song?

Here we will use each song's ground-truth chord sequence (hashed) as the feature matrix, and the best-scoring algorithm for that song as the target.


In [39]:
from sklearn.metrics import accuracy_score

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
clf = DecisionTreeClassifier()

In [42]:
# need to make the X matrix of features.
# features need to be numbers -- here we hash each chord label string.
# the chords of a song make up one row of the matrix
# (in the order they are listed in the ground truth).

def x_matrix(truth_files):
    '''Return the feature matrix for all songs.
    
    Each row holds the hashed ground-truth chord sequence of one song,
    zero-padded to the length of the longest sequence.'''
    
    file_base_name = 'chord_mrx_09_{:0>6}.lab'
    X = []
    longest_row = 0
    chords = []

    assert os.path.isdir(truth_files), 'no such dir:\n{0}'.format(truth_files)
    
    for song_number in range(217):
        #iter through all songs
        
        chords = []
        f_file = os.path.join(truth_files, file_base_name.format(song_number))
        assert os.path.isfile(f_file), 'no such file:\n{0}'.format(f_file)

        #print(f_file)
        with open(f_file) as infile:
            # iter through all chords in the song
            for line in infile.readlines():
                chords.append(line.split()[-1])

        X.append([hash(chord) for chord in chords])
        longest_row = max(longest_row, len(chords))
    

    return squared_matrix(X, longest_row)


def squared_matrix(M, n):
    # pad every row with zeros so all rows have length n (rectangular, not necessarily square)
    for row in M:
        while len(row) < n:
            row.append(0)
    return np.matrix(M)

In [43]:
truth_files = data_path(2015, d_type='groundtruth')
X = x_matrix(truth_files)
X.shape


Out[43]:
(217, 262)

In [44]:
def get_Y(year_results):
    Y = []
    #algos = ['CM3.csv', 'DK4.csv', 'DK5.csv', 'DK6.csv',
    #        'DK7.csv', 'DK8.csv', 'DK9.csv', 'KO1.csv']
    algos = list(year_results.keys())
    for x in range(217):
        # iter the songs
        song = 'chord_mrx_09_{:0>6}'.format(x)
        #print(song)
        best = ['', -1]
        
        for algo in algos:
            # select the best performing algo on this song
            df = year_results[algo]
            #print('ALGO:', algo)
            score = df.loc[df['File'] == song]['Pairwise score (%)'].tolist()[0]
            #print("ALGO:", algo, '--', score)
            if score > best[-1]:
                # this was a better scoring algo -- replace the current leading
                # pair with this one:
                best = [algo, score]
        
        Y.append(hash(best[0]))  # append the hash of the best scoring algo to Y
    return Y

In [45]:
Y = get_Y(results_2015)

# training the classifier on the first 100 songs:
clf.fit(X[:100], Y[:100])

# testing on the last 117 songs in the set:
Y_pred = clf.predict(X[100:])
accuracy_score(Y[100:], Y_pred)


Out[45]:
0.53846153846153844

In [46]:
# This is a pretty good guess. Random would be 0.125 since there are 8 algorithms.

# what would the average be if you chose the best at every song?
def get_best_scores(year_results):
    Y = []
    algos = list(year_results.keys())
    for x in range(217):
        # iter the songs
        song = 'chord_mrx_09_{:0>6}'.format(x)
        #print(song)
        best = ['', -1]
        
        for algo in algos:
            # select the best performing algo on this song
            df = year_results[algo]
            #print('ALGO:', algo)
            score = df.loc[df['File'] == song]['Pairwise score (%)'].tolist()[0]
            #print("ALGO:", algo, '--', score)
            if score > best[-1]:
                # this was a better scoring algo -- replace the current leading
                # pair with this one:
                best = [algo, score]
        
        Y.append(best[-1])  # append the best score itself to Y
    return Y

np.average(get_best_scores(results_2015))


Out[46]:
84.189825179723513

In [47]:
# this is actually not that much better than the best algorithm by itself. 
# What this means is that, for the most part, one algorithm is consistently the best
# across all songs.
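
One way to check that claim is to count how often each submission is the per-song winner; a sketch that reuses the same lookup as get_Y above:


In [ ]:
from collections import Counter

best_counts = Counter()
for x in range(217):
    song = 'chord_mrx_09_{:0>6}'.format(x)
    # which submission has the highest pairwise score on this song?
    best = max(results_2015.keys(),
               key=lambda algo: results_2015[algo]
                   .loc[results_2015[algo]['File'] == song]['Pairwise score (%)']
                   .tolist()[0])
    best_counts[best] += 1

print(best_counts.most_common())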

In [48]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X[:100], Y[:100])
y_pred_svm = svm_clf.predict(X[100:])
accuracy_score(Y[100:], y_pred_svm)


Out[48]:
0.76068376068376065

In [49]:
from sklearn.ensemble import RandomForestClassifier
rnd_forest_clf = RandomForestClassifier()
rnd_forest_clf.fit(X[:100], Y[:100])
y_pred_rnd = rnd_forest_clf.predict(X[100:])
accuracy_score(Y[100:], y_pred_rnd)


Out[49]:
0.7350427350427351

In [50]:
# So, with SVM we can guess which algorithm will give the best results 76% of the time.
# this is really high considering random is 12.5%

Perhaps we can look at the average length of a chord name in the ground truth files: the average for the 'bad' songs, for the 'good' songs, and overall.

Here we treat longer chord names as indicating more complicated, and therefore more unusual, chords.


In [51]:
print(common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10]))


['chord_mrx_09_000017', 'chord_mrx_09_000199', 'chord_mrx_09_000050', 'chord_mrx_09_000151', 'chord_mrx_09_000190', 'chord_mrx_09_000031', 'chord_mrx_09_000053', 'chord_mrx_09_000030']

In [52]:
def ave_chord_length(root, songs, ext='.lab', algos=None):
    '''Return (average chord-name length, song duration) pairs for the set.
    
    The durations are used later as weights when averaging across songs.'''
    
    avgs_weights = []
    
    assert os.path.isdir(root), 'no such dir: {0}'.format(root)
    if algos is None:
        for song in songs:
            # iter through all songs
            song_file = os.path.join(root, song + ext)
            assert os.path.isfile(song_file), 'no such file: {0}'.format(song_file)
            with open(song_file) as stream:
                song_data = stream.read().split()

            # group the flat token list into (onset, offset, chord) triples
            lengths = [len(chord) for a, b, chord in zip(*[iter(song_data)] * 3)]
            average = np.average(lengths)
            weight = float(song_data[-2])

            avgs_weights.append((average, weight))
    else:
        # do as above, but over each algorithm's output for each song;
        # every (algorithm, song) pair contributes one (average, weight) entry.
        for song in songs:
            # iter through all songs
            for algo in algos:
                # iter through each algo's output for this song
                song_file = os.path.join(root, algo, song + ext)
                assert os.path.isfile(song_file), 'no such file: {0}'.format(song_file)
                with open(song_file) as stream:
                    song_data = stream.read().split()

                # group the flat token list into (onset, offset, chord) triples
                lengths = [len(chord) for a, b, chord in zip(*[iter(song_data)] * 3)]
                average = np.average(lengths)
                weight = float(song_data[-2])
                avgs_weights.append((average, weight))
    
    return avgs_weights

In [53]:
avg_and_weights_groundtruth_2015 = ave_chord_length(truth_files, songs_2015)

avg_chord_len_groundtruth_2015 = np.average([avg for avg, weight in avg_and_weights_groundtruth_2015],
          weights=[weight for avg, weight in avg_and_weights_groundtruth_2015])

# weighted average chord length from the 2015 ground truth:
avg_chord_len_groundtruth_2015


Out[53]:
2.9448496051642525

In [54]:
output_files = data_path(2015, d_type='outputs')

avg_and_weights_2015 = ave_chord_length(output_files, songs_2015, ext=".wav.txt",
                                        algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
avg_chord_len_2015 = np.average([avg for avg, weight in avg_and_weights_2015],
          weights=[weight for avg, weight in avg_and_weights_2015])

#overall average across all song outputs from all submissions:
avg_chord_len_2015


Out[54]:
5.1383852210219807

In [55]:
bad_songs = common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10])
good_songs = common_songs(sorted_song_totals_2015[-10:], sorted_song_totals_2011[-10:])

In [56]:
bad_avg_and_weights_2015 = ave_chord_length(output_files, bad_songs, ext=".wav.txt",
                                        algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
bad_avg_chord_len_2015 = np.average([avg for avg, weight in bad_avg_and_weights_2015],
          weights=[weight for avg, weight in bad_avg_and_weights_2015])

#overall average across all 'bad' song outputs from all submissions:
bad_avg_chord_len_2015


Out[56]:
5.0687015911893347

In [57]:
good_avg_and_weights_2015 = ave_chord_length(output_files, good_songs, ext=".wav.txt",
                                        algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
good_avg_chord_len_2015 = np.average([avg for avg, weight in good_avg_and_weights_2015],
          weights=[weight for avg, weight in good_avg_and_weights_2015])

#overall average across all 'good' song outputs from all submissions:
good_avg_chord_len_2015


Out[57]:
4.9289542019987191

There isn't much difference in average guessed chord-name length between the bottom songs, the top songs, and the set as a whole. However, there is a significant difference between the average guessed chord-name length and the average actual chord-name length: an increase of approximately 75%. This suggests that, overall, the algorithms believe the songs to be much more complicated than they usually are.
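
The roughly 75% figure follows from the two averages computed above:


In [ ]:
# relative increase of guessed chord-name length over the ground-truth chord-name length:
increase = (avg_chord_len_2015 - avg_chord_len_groundtruth_2015) / avg_chord_len_groundtruth_2015
print("{:.0%}".format(increase))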


In [ ]: